Libraries

Load required libraries

library(tidyverse)
── Attaching core tidyverse packages ───────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(here)
G3;here() starts at /Users/petermacpherson/Documents/Documents - Peter’s Mac mini/Projects/AQ-MTB_2025-05-14/Work/Data/aq_mtb
g
library(brms)
G3;Loading required package: Rcpp
gG3;Loading 'brms' package (version 2.22.0). Useful instructions
can be found by typing help('brms'). A more detailed introduction
to the package is available through vignette('brms_overview').
gG3;
Attaching package: ‘brms’

gG3;The following object is masked from ‘package:stats’:

    ar

g
library(tidybayes)
G3;
Attaching package: ‘tidybayes’

gG3;The following objects are masked from ‘package:brms’:

    dstudent_t, pstudent_t, qstudent_t, rstudent_t

g
library(sf)
G3;Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE
g
library(mgcv)
G3;Loading required package: nlme
gG3;
Attaching package: ‘nlme’

gG3;The following object is masked from ‘package:dplyr’:

    collapse

gG3;This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
gG3;
Attaching package: ‘mgcv’

gG3;The following objects are masked from ‘package:brms’:

    s, t2

g
library(priorsense)
library(osmdata)
G3;Data (c) OpenStreetMap contributors, ODbL 1.0. https://www.openstreetmap.org/copyright
g
library(nngeo)
G3;Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     
g
library(glue)
library(arrow)
G3;
Attaching package: ‘arrow’

gG3;The following object is masked from ‘package:lubridate’:

    duration

gG3;The following object is masked from ‘package:utils’:

    timestamp

g

Import data

Import the modelling dataset - cleaned and prepped in the 2025-05-12_aq_clean.Rmd script


aq_model_data <- read_rds("input_data/aq_model_data.rds")

#also load the scale clusters
load("input_data/scale_72_clusters.rda") #SCALE clusters

#and the grid data
mw_100m_cropped <- read_rds("input_data/mw_100m_cropped.rds")
mw_100m_grid_sf <- read_rds("input_data/mw_100m_grid_sf.rds")

Functions

get_st_predictions(): A function to take posterior draws for a prediction matrix, write to an arrow database (to handle exhausting memory), then summarise

get_st_predictions <- function(model, nd, model_id, out_dir, 
                               ndraws = 1000, chunk_size = 10000, 
                               who_pm25_limit = 25,
                               summarise_exposure_by = "grid_id",
                               write_summary = TRUE) {
  dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
  nd <- nd %>% dplyr::mutate(.row = dplyr::row_number())
  n_chunks <- ceiling(nrow(nd) / chunk_size)

  nd_file <- file.path(out_dir, paste0("nd_", model_id, ".parquet"))
  arrow::write_parquet(nd, nd_file)

  for (i in seq_len(n_chunks)) {
    cat("Processing chunk", i, "of", n_chunks, "\n")
    idx <- ((i - 1) * chunk_size + 1):min(i * chunk_size, nrow(nd))
    nd_chunk <- nd[idx, ]

    epred_array <- posterior_epred(model, newdata = nd_chunk, ndraws = ndraws)

    epred_df <- as.data.frame.table(epred_array, responseName = "epred") %>%
      dplyr::rename(draw = Var1, row_in_chunk = Var2) %>%
      dplyr::mutate(
        draw = as.integer(draw),
        row_in_chunk = as.integer(row_in_chunk),
        .row = idx[row_in_chunk],
        model_id = model_id
      ) %>%
      dplyr::select(draw, .row, epred, model_id)

    arrow::write_parquet(
      epred_df,
      sink = file.path(out_dir, glue::glue("epred_chunk_{i}.parquet"))
    )

    rm(epred_array, epred_df, nd_chunk)
    gc()
  }

  # Load full draws and metadata for summarisation
  ds <- arrow::open_dataset(out_dir)
  nd_ds <- arrow::open_dataset(nd_file)

  joined <- ds %>%
    dplyr::left_join(nd_ds, by = ".row") %>%
    dplyr::mutate(
      epred_pm25 = exp(epred),
      days = lubridate::days_in_month(date),
      exceed = epred_pm25 > who_pm25_limit
    )

  # Per-.row posterior summaries (as before)
  row_summary <- joined %>%
    dplyr::group_by(.row) %>%
    dplyr::summarise(
      mean_log_epred = mean(epred, na.rm = TRUE),
      sd_log_epred = sd(epred, na.rm = TRUE),
      lwr_log = quantile(epred, 0.025, na.rm = TRUE),
      upr_log = quantile(epred, 0.975, na.rm = TRUE),

      mean_epred = mean(epred_pm25, na.rm = TRUE),
      sd_epred = sd(epred_pm25, na.rm = TRUE),
      lwr = quantile(epred_pm25, 0.025, na.rm = TRUE),
      upr = quantile(epred_pm25, 0.975, na.rm = TRUE),

      prob_exceed = mean(exceed, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    dplyr::collect() %>%
    dplyr::left_join(nd, by = ".row") %>%
    dplyr::mutate(date = lubridate::month(date, label = TRUE))

  # Additional exposure summaries across draws
  exposure_summary <- joined %>%
    dplyr::group_by_at(c(summarise_exposure_by, "draw")) %>%
    dplyr::summarise(
      cum_exposure = sum(epred_pm25 * days, na.rm = TRUE),
      mean_exposure = mean(epred_pm25, na.rm = TRUE),
      prob_exceed = mean(exceed, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    dplyr::collect() %>%
    dplyr::group_by_at(summarise_exposure_by) %>%
    dplyr::summarise(
      mean_pm25 = mean(mean_exposure),
      sd_pm25 = sd(mean_exposure),
      lwr_pm25 = quantile(mean_exposure, 0.025),
      upr_pm25 = quantile(mean_exposure, 0.975),

      mean_cum = mean(cum_exposure),
      sd_cum = sd(cum_exposure),
      lwr_cum = quantile(cum_exposure, 0.025),
      upr_cum = quantile(cum_exposure, 0.975),

      prob_exceed = mean(prob_exceed),
      .groups = "drop"
    )

  # Optionally write to disk
  if (write_summary) {
    arrow::write_parquet(row_summary, file.path(out_dir, paste0("summary_", model_id, "_row.parquet")))
    arrow::write_parquet(exposure_summary, file.path(out_dir, paste0("summary_", model_id, "_exposure.parquet")))
  }

  return(list(row_summary = row_summary, exposure_summary = exposure_summary))
}

plot_st_predictions(): A function to plot spatiotemporal predictions from model posteriors

compute_exposure_metrics() calculates exposure metrics for the 1 year p

PM2.5

First we will model PM2.5

Model 1: Day of year only

“We fitted a spatially smooth regression model for log-transformed PM2.5 concentrations using a Gaussian process (GP) smooth over spatial coordinates (x, y) to capture flexible spatial trends. Seasonal variation was modelled using a Fourier series expansion of day-of-year up to the 3rd harmonic (i.e., sine and cosine terms for annual, semi-annual, and tri-annual cycles) to account for complex seasonal patterns. The model was fitted in a Bayesian framework using brms with a Gaussian likelihood and the cmdstanr backend.”


priors <- c(
  prior(normal(3.4, 1), class = "Intercept"),
  prior(normal(0,1), class = "b"),
  prior(exponential(1), class = "sigma"),
  prior(exponential(1), class = "sdgp"),
  prior(normal(0, 1), class = "lscale", lb = 0)
)

Fit prior only model


m1_prior <- brm(
    formula = log_pm2_5 ~ gp(x, y, k=15) +
      sin_doy1 + cos_doy1 +
      sin_doy2 + cos_doy2 +
      sin_doy3 + cos_doy3,
    data = aq_model_data,
    family = gaussian(),
    prior = priors,
    sample_prior = "only",
    chains = 4, cores = 4,
    backend = "cmdstanr"
  )
G2;H2;Warningh: The global prior 'normal(0, 1)' of class 'lscale' will not be used in the model as all related coefficients have individual priors already. If you did not set those priors yourself, then maybe brms has assigned default priors. See ?set_prior and ?default_prior for more details.g
G3;Compiling Stan program...
g

-

\

|

/

-

\

|

/

-

\

|

/

-

\

|

/

-

\

|

/

-

\

|

/

-

\

|

/

-

\

|

/

-

\

|

/

-

\

|

/

-

\

|

/

-

 
G3;Start sampling
g
Running MCMC with 4 parallel chains...

Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 1 Iteration:  100 / 2000 [  5%]  (Warmup) 
Chain 1 Iteration:  200 / 2000 [ 10%]  (Warmup) 
Chain 1 Iteration:  300 / 2000 [ 15%]  (Warmup) 
Chain 1 Iteration:  400 / 2000 [ 20%]  (Warmup) 
Chain 1 Iteration:  500 / 2000 [ 25%]  (Warmup) 
Chain 1 Iteration:  600 / 2000 [ 30%]  (Warmup) 
Chain 1 Iteration:  700 / 2000 [ 35%]  (Warmup) 
Chain 1 Iteration:  800 / 2000 [ 40%]  (Warmup) 
Chain 1 Iteration:  900 / 2000 [ 45%]  (Warmup) 
Chain 1 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 1 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 1 Iteration: 1100 / 2000 [ 55%]  (Sampling) 
Chain 1 Iteration: 1200 / 2000 [ 60%]  (Sampling) 
Chain 1 Iteration: 1300 / 2000 [ 65%]  (Sampling) 
Chain 1 Iteration: 1400 / 2000 [ 70%]  (Sampling) 
Chain 1 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 2 Iteration:  100 / 2000 [  5%]  (Warmup) 
Chain 2 Iteration:  200 / 2000 [ 10%]  (Warmup) 
Chain 2 Iteration:  300 / 2000 [ 15%]  (Warmup) 
Chain 2 Iteration:  400 / 2000 [ 20%]  (Warmup) 
Chain 2 Iteration:  500 / 2000 [ 25%]  (Warmup) 
Chain 2 Iteration:  600 / 2000 [ 30%]  (Warmup) 
Chain 2 Iteration:  700 / 2000 [ 35%]  (Warmup) 
Chain 2 Iteration:  800 / 2000 [ 40%]  (Warmup) 
Chain 2 Iteration:  900 / 2000 [ 45%]  (Warmup) 
Chain 2 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 2 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 2 Iteration: 1100 / 2000 [ 55%]  (Sampling) 
Chain 2 Iteration: 1200 / 2000 [ 60%]  (Sampling) 
Chain 2 Iteration: 1300 / 2000 [ 65%]  (Sampling) 
Chain 2 Iteration: 1400 / 2000 [ 70%]  (Sampling) 
Chain 3 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 3 Iteration:  100 / 2000 [  5%]  (Warmup) 
Chain 3 Iteration:  200 / 2000 [ 10%]  (Warmup) 
Chain 3 Iteration:  300 / 2000 [ 15%]  (Warmup) 
Chain 3 Iteration:  400 / 2000 [ 20%]  (Warmup) 
Chain 3 Iteration:  500 / 2000 [ 25%]  (Warmup) 
Chain 3 Iteration:  600 / 2000 [ 30%]  (Warmup) 
Chain 3 Iteration:  700 / 2000 [ 35%]  (Warmup) 
Chain 3 Iteration:  800 / 2000 [ 40%]  (Warmup) 
Chain 3 Iteration:  900 / 2000 [ 45%]  (Warmup) 
Chain 3 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 3 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 3 Iteration: 1100 / 2000 [ 55%]  (Sampling) 
Chain 3 Iteration: 1200 / 2000 [ 60%]  (Sampling) 
Chain 3 Iteration: 1300 / 2000 [ 65%]  (Sampling) 
Chain 4 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 4 Iteration:  100 / 2000 [  5%]  (Warmup) 
Chain 4 Iteration:  200 / 2000 [ 10%]  (Warmup) 
Chain 4 Iteration:  300 / 2000 [ 15%]  (Warmup) 
Chain 4 Iteration:  400 / 2000 [ 20%]  (Warmup) 
Chain 4 Iteration:  500 / 2000 [ 25%]  (Warmup) 
Chain 4 Iteration:  600 / 2000 [ 30%]  (Warmup) 
Chain 4 Iteration:  700 / 2000 [ 35%]  (Warmup) 
Chain 4 Iteration:  800 / 2000 [ 40%]  (Warmup) 
Chain 4 Iteration:  900 / 2000 [ 45%]  (Warmup) 
Chain 4 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 4 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 4 Iteration: 1100 / 2000 [ 55%]  (Sampling) 
Chain 4 Iteration: 1200 / 2000 [ 60%]  (Sampling) 
Chain 4 Iteration: 1300 / 2000 [ 65%]  (Sampling) 
Chain 1 Iteration: 1600 / 2000 [ 80%]  (Sampling) 
Chain 1 Iteration: 1700 / 2000 [ 85%]  (Sampling) 
Chain 1 Iteration: 1800 / 2000 [ 90%]  (Sampling) 
Chain 1 Iteration: 1900 / 2000 [ 95%]  (Sampling) 
Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 2 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
Chain 2 Iteration: 1600 / 2000 [ 80%]  (Sampling) 
Chain 2 Iteration: 1700 / 2000 [ 85%]  (Sampling) 
Chain 2 Iteration: 1800 / 2000 [ 90%]  (Sampling) 
Chain 2 Iteration: 1900 / 2000 [ 95%]  (Sampling) 
Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 3 Iteration: 1400 / 2000 [ 70%]  (Sampling) 
Chain 3 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
Chain 3 Iteration: 1600 / 2000 [ 80%]  (Sampling) 
Chain 3 Iteration: 1700 / 2000 [ 85%]  (Sampling) 
Chain 3 Iteration: 1800 / 2000 [ 90%]  (Sampling) 
Chain 3 Iteration: 1900 / 2000 [ 95%]  (Sampling) 
Chain 3 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 4 Iteration: 1400 / 2000 [ 70%]  (Sampling) 
Chain 4 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
Chain 4 Iteration: 1600 / 2000 [ 80%]  (Sampling) 
Chain 4 Iteration: 1700 / 2000 [ 85%]  (Sampling) 
Chain 4 Iteration: 1800 / 2000 [ 90%]  (Sampling) 
Chain 4 Iteration: 1900 / 2000 [ 95%]  (Sampling) 
Chain 4 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 1 finished in 0.2 seconds.
Chain 2 finished in 0.2 seconds.
Chain 3 finished in 0.2 seconds.
Chain 4 finished in 0.2 seconds.

All 4 chains finished successfully.
Mean chain execution time: 0.2 seconds.
Total execution time: 0.4 seconds.
G3;Loading required namespace: rstan
g
# summary(m1_prior)
# plot(m1_prior)

Now model with data.

m1 <- brm(
    formula = log_pm2_5 ~ gp(x, y, k=15)  +
    sin_doy1 + cos_doy1 +
    sin_doy2 + cos_doy2 +
    sin_doy3 + cos_doy3,
    data = aq_model_data,
    family = gaussian(),
    prior = priors,
    chains = 4, cores = 4,
    backend = "cmdstanr"
  )
G2;H2;Warningh: The global prior 'normal(0, 1)' of class 'lscale' will not be used in the model as all related coefficients have individual priors already. If you did not set those priors yourself, then maybe brms has assigned default priors. See ?set_prior and ?default_prior for more details.g
G3;Model executable is up to date!
gG3;Start sampling
g
Running MCMC with 4 parallel chains...

Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 3 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 4 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 2 Iteration:  100 / 2000 [  5%]  (Warmup) 
Chain 1 Iteration:  100 / 2000 [  5%]  (Warmup) 
Chain 3 Iteration:  100 / 2000 [  5%]  (Warmup) 
Chain 4 Iteration:  100 / 2000 [  5%]  (Warmup) 
Chain 1 Iteration:  200 / 2000 [ 10%]  (Warmup) 
Chain 2 Iteration:  200 / 2000 [ 10%]  (Warmup) 
Chain 4 Iteration:  200 / 2000 [ 10%]  (Warmup) 
Chain 3 Iteration:  200 / 2000 [ 10%]  (Warmup) 
Chain 3 Iteration:  300 / 2000 [ 15%]  (Warmup) 
Chain 1 Iteration:  300 / 2000 [ 15%]  (Warmup) 
Chain 2 Iteration:  300 / 2000 [ 15%]  (Warmup) 
Chain 4 Iteration:  300 / 2000 [ 15%]  (Warmup) 
Chain 3 Iteration:  400 / 2000 [ 20%]  (Warmup) 
Chain 4 Iteration:  400 / 2000 [ 20%]  (Warmup) 
Chain 1 Iteration:  400 / 2000 [ 20%]  (Warmup) 
Chain 2 Iteration:  400 / 2000 [ 20%]  (Warmup) 
Chain 4 Iteration:  500 / 2000 [ 25%]  (Warmup) 
Chain 1 Iteration:  500 / 2000 [ 25%]  (Warmup) 
Chain 3 Iteration:  500 / 2000 [ 25%]  (Warmup) 
Chain 2 Iteration:  500 / 2000 [ 25%]  (Warmup) 
Chain 4 Iteration:  600 / 2000 [ 30%]  (Warmup) 
Chain 1 Iteration:  600 / 2000 [ 30%]  (Warmup) 
Chain 3 Iteration:  600 / 2000 [ 30%]  (Warmup) 
Chain 2 Iteration:  600 / 2000 [ 30%]  (Warmup) 
Chain 4 Iteration:  700 / 2000 [ 35%]  (Warmup) 
Chain 1 Iteration:  700 / 2000 [ 35%]  (Warmup) 
Chain 3 Iteration:  700 / 2000 [ 35%]  (Warmup) 
Chain 2 Iteration:  700 / 2000 [ 35%]  (Warmup) 
Chain 4 Iteration:  800 / 2000 [ 40%]  (Warmup) 
Chain 1 Iteration:  800 / 2000 [ 40%]  (Warmup) 
Chain 3 Iteration:  800 / 2000 [ 40%]  (Warmup) 
Chain 2 Iteration:  800 / 2000 [ 40%]  (Warmup) 
Chain 4 Iteration:  900 / 2000 [ 45%]  (Warmup) 
Chain 1 Iteration:  900 / 2000 [ 45%]  (Warmup) 
Chain 3 Iteration:  900 / 2000 [ 45%]  (Warmup) 
Chain 2 Iteration:  900 / 2000 [ 45%]  (Warmup) 
Chain 4 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 4 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 1 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 1 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 3 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 3 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 2 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 2 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 4 Iteration: 1100 / 2000 [ 55%]  (Sampling) 
Chain 4 Iteration: 1200 / 2000 [ 60%]  (Sampling) 
Chain 1 Iteration: 1100 / 2000 [ 55%]  (Sampling) 
Chain 3 Iteration: 1100 / 2000 [ 55%]  (Sampling) 
Chain 2 Iteration: 1100 / 2000 [ 55%]  (Sampling) 
Chain 4 Iteration: 1300 / 2000 [ 65%]  (Sampling) 
Chain 1 Iteration: 1200 / 2000 [ 60%]  (Sampling) 
Chain 4 Iteration: 1400 / 2000 [ 70%]  (Sampling) 
Chain 3 Iteration: 1200 / 2000 [ 60%]  (Sampling) 
Chain 2 Iteration: 1200 / 2000 [ 60%]  (Sampling) 
Chain 4 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
Chain 1 Iteration: 1300 / 2000 [ 65%]  (Sampling) 
Chain 4 Iteration: 1600 / 2000 [ 80%]  (Sampling) 
Chain 3 Iteration: 1300 / 2000 [ 65%]  (Sampling) 
Chain 2 Iteration: 1300 / 2000 [ 65%]  (Sampling) 
Chain 4 Iteration: 1700 / 2000 [ 85%]  (Sampling) 
Chain 1 Iteration: 1400 / 2000 [ 70%]  (Sampling) 
Chain 4 Iteration: 1800 / 2000 [ 90%]  (Sampling) 
Chain 3 Iteration: 1400 / 2000 [ 70%]  (Sampling) 
Chain 2 Iteration: 1400 / 2000 [ 70%]  (Sampling) 
Chain 4 Iteration: 1900 / 2000 [ 95%]  (Sampling) 
Chain 1 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
Chain 4 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 4 finished in 91.6 seconds.
Chain 3 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
Chain 2 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
Chain 1 Iteration: 1600 / 2000 [ 80%]  (Sampling) 
Chain 3 Iteration: 1600 / 2000 [ 80%]  (Sampling) 
Chain 2 Iteration: 1600 / 2000 [ 80%]  (Sampling) 
Chain 1 Iteration: 1700 / 2000 [ 85%]  (Sampling) 
Chain 3 Iteration: 1700 / 2000 [ 85%]  (Sampling) 
Chain 2 Iteration: 1700 / 2000 [ 85%]  (Sampling) 
Chain 1 Iteration: 1800 / 2000 [ 90%]  (Sampling) 
Chain 3 Iteration: 1800 / 2000 [ 90%]  (Sampling) 
Chain 2 Iteration: 1800 / 2000 [ 90%]  (Sampling) 
Chain 1 Iteration: 1900 / 2000 [ 95%]  (Sampling) 
Chain 3 Iteration: 1900 / 2000 [ 95%]  (Sampling) 
Chain 2 Iteration: 1900 / 2000 [ 95%]  (Sampling) 
Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 1 finished in 124.1 seconds.
Chain 3 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 3 finished in 125.7 seconds.
Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 2 finished in 127.5 seconds.

All 4 chains finished successfully.
Mean chain execution time: 117.2 seconds.
Total execution time: 127.7 seconds.
# summary(m1)
# conditional_effects(m1)
# pp_check(m1)
# plot(m1)

Compare prior-only model to model with data using priorsense package:


m1_draws <- as_draws_df(m1)

# Extract prior and posterior draws
# powerscale_sensitivity(m1, variable = c("b_Intercept", 
#                                         "b_sin_doy1", "b_cos_doy1",
#                                         "b_sin_doy2", "b_cos_doy2",
#                                         "b_sin_doy3", "b_cos_doy3",
#                                         "sdgp_gpxy", "sigma", "Intercept"))
# 
# powerscale_plot_dens(m1, variable = c("b_Intercept", 
#                                         "b_sin_doy1", "b_cos_doy1",
#                                         "b_sin_doy2", "b_cos_doy2",
#                                         "b_sin_doy3", "b_cos_doy3",
#                                         "sdgp_gpxy", "sigma", "Intercept"))

Predictions by space and time. Here, although we only have data within clusters, we will predict for cells outside of clusters based on covariates. We will also predict for the month for whcih we do not have data.

set up the predictions matrix

#set_up prediction date frame
nd_m1 <- mw_100m_grid_sf %>%
  sf::st_sf() %>%
  mutate(x = st_coordinates(st_centroid(.))[, 1],
         y = st_coordinates(st_centroid(.))[, 2]) %>%
  st_drop_geometry() %>%
  rename(pop_density = 1) %>%
  mutate(pop_density_km2 = pop_density / 0.01)
G2;H2;Warningh: There were 2 warnings in `stopifnot()`.
The first warning was:
ℹ In argument: `x = st_coordinates(st_centroid(.))[, 1]`.
Caused by warning:
! st_centroid assumes attributes are constant over geometries
ℹ Run ]8;;ide:run:dplyr::last_dplyr_]8;;ide:run:warnings()warnings()]8;;dplyr::last_dplyr_]8;;ide:run:warnings()warnings()]8;;]8;; to see the 1 remaining warning.g

Get the spatiotemporal predictions for model 1

m1_sum <- get_st_predictions(model = m1, nd=nd_m1, model_id = "m1", out_dir = "epred_chunks_m1", ndraws = 1000, chunk_size = 10000, who_pm25_limit = 25)
Processing chunk 1 of 38 
Processing chunk 2 of 38 
Processing chunk 3 of 38 
Processing chunk 4 of 38 
Processing chunk 5 of 38 
Processing chunk 6 of 38 
Processing chunk 7 of 38 
Processing chunk 8 of 38 
Processing chunk 9 of 38 
Processing chunk 10 of 38 
Processing chunk 11 of 38 
Processing chunk 12 of 38 
Processing chunk 13 of 38 
Processing chunk 14 of 38 
Processing chunk 15 of 38 
Processing chunk 16 of 38 
Processing chunk 17 of 38 
Processing chunk 18 of 38 
Processing chunk 19 of 38 
Processing chunk 20 of 38 
Processing chunk 21 of 38 
Processing chunk 22 of 38 
Processing chunk 23 of 38 
Processing chunk 24 of 38 
Processing chunk 25 of 38 
Processing chunk 26 of 38 
Processing chunk 27 of 38 
Processing chunk 28 of 38 
Processing chunk 29 of 38 
Processing chunk 30 of 38 
Processing chunk 31 of 38 
Processing chunk 32 of 38 
Processing chunk 33 of 38 
Processing chunk 34 of 38 
Processing chunk 35 of 38 
Processing chunk 36 of 38 
Processing chunk 37 of 38 
Processing chunk 38 of 38 

Plot predictions for model 1

m1_plots %>% map(plot)
$mean_log_plot

$mean_plot

$sd_plot

$exceed_plot

Now just draw 50 random coordinates, and predict by time to see whether we captured the trend.

#sample coordinate points from the household dataset
#we will fix the small number sampled later!
set.seed(123) 
sampled_points_m1 <- aq_model_data %>%
  sample_n(50) %>%
  select(x, y)

#generate DOY 0–365 and calculate Fourier terms
doy_grid <- tibble(doy = 0:365) %>%
  mutate(
    sin_doy1 = sin(2 * pi * doy / 365),
    cos_doy1 = cos(2 * pi * doy / 365),
    sin_doy2 = sin(4 * pi * doy / 365),
    cos_doy2 = cos(4 * pi * doy / 365),
    sin_doy3 = sin(6 * pi * doy / 365),
    cos_doy3 = cos(6 * pi * doy / 365)
  )

#expand grid across sampled locations
prediction_df_m1 <- sampled_points_m1 %>%
  crossing(doy_grid)

#add predictions
preds_m1 <- add_epred_draws(object = m1, newdata = prediction_df_m1)

#summarise
preds_m1_sum <- preds_m1 %>%
  ungroup() %>%
  group_by(doy) %>%
  summarise(.epred_mean = mean(.epred),
            .lower = quantile(.epred, probs=0.025),
            .upper = quantile(.epred, probs = 0.975))

#GEt the observed data
observed_data <- aq_model_data %>%
  select(mean_doy, log_pm2_5) 

#plot
preds_m1_sum %>%
  ggplot() +
  geom_ribbon(aes(x=doy, ymin=.lower, ymax = .upper), fill="steelblue", alpha=0.3) +
  geom_line(aes(x=doy, y=.epred_mean)) +
  geom_jitter(data = observed_data, aes(x = mean_doy, y = log_pm2_5),
              color = "darkred", alpha = 0.6, width = 0.5, height = 0.0, size = 1.2) +
  labs(title = "Model-estimated log(PM2.5) with empirical measurements",
       subtitle = "Model predictions with 95% CrI and observed data points",
       x = "Day of year",
       y = "log(PM2.5)",
       caption = "Modelled estimates restricted to within clusters") +
  theme_ggdist() +
  theme(panel.background = element_rect(colour = "grey78"),
        legend.position = "none")

Model 2: With covariates

Now we include grid level covariates of distance to the road, population density, and building footprint percent, along witht the mean temp and huidity on the day of measurement (from the purple air monitors)

Again, priors to be fixed later

priors <- c(
  prior(normal(3.4, 1), class = "Intercept"),
  prior(normal(0,1), class = "b"),
  prior(exponential(1), class = "sigma"),
  prior(exponential(1), class = "sdgp"),
  prior(normal(0, 1), class = "lscale", lb = 0)
)


m2 <- brm(
    formula = log_pm2_5 ~ gp(x, y, k=15)  +
      s(mean_temp_c, k=5) + 
      s(mean_current_humidity, k=5) +
      s(pop_density_km2, k=5) +
      s(building_coverage_pct, k=5) +
      s(dist_to_road_m, k=5) +
      sin_doy1 + cos_doy1 +
      sin_doy2 + cos_doy2 +
      sin_doy3 + cos_doy3,
    data = aq_model_data,
    family = gaussian(),
    prior = priors,
    chains = 4, cores = 4,
    backend = "cmdstanr"
  )
G2;H2;Warningh: The global prior 'normal(0, 1)' of class 'lscale' will not be used in the model as all related coefficients have individual priors already. If you did not set those priors yourself, then maybe brms has assigned default priors. See ?set_prior and ?default_prior for more details.g
G3;Compiling Stan program...
g

-

\

|

/

-

\

|

/

-

\

|

/

-

\

|

/

-

\

|

/

-

\

|

/

-

\

|

/

-

\

|

/

-

\

|

/

-

\

|

/

-

\

|

/

-

\

|

/

-

\

|

/

-

\

|

/

-

\

|

/

 
G3;Start sampling
g
Running MCMC with 4 parallel chains...

Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 3 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 4 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 1 Iteration:  100 / 2000 [  5%]  (Warmup) 
Chain 2 Iteration:  100 / 2000 [  5%]  (Warmup) 
Chain 4 Iteration:  100 / 2000 [  5%]  (Warmup) 
Chain 3 Iteration:  100 / 2000 [  5%]  (Warmup) 
Chain 4 Iteration:  200 / 2000 [ 10%]  (Warmup) 
Chain 1 Iteration:  200 / 2000 [ 10%]  (Warmup) 
Chain 2 Iteration:  200 / 2000 [ 10%]  (Warmup) 
Chain 3 Iteration:  200 / 2000 [ 10%]  (Warmup) 
Chain 4 Iteration:  300 / 2000 [ 15%]  (Warmup) 
Chain 1 Iteration:  300 / 2000 [ 15%]  (Warmup) 
Chain 2 Iteration:  300 / 2000 [ 15%]  (Warmup) 
Chain 3 Iteration:  300 / 2000 [ 15%]  (Warmup) 
Chain 4 Iteration:  400 / 2000 [ 20%]  (Warmup) 
Chain 1 Iteration:  400 / 2000 [ 20%]  (Warmup) 
Chain 2 Iteration:  400 / 2000 [ 20%]  (Warmup) 
Chain 3 Iteration:  400 / 2000 [ 20%]  (Warmup) 
Chain 4 Iteration:  500 / 2000 [ 25%]  (Warmup) 
Chain 1 Iteration:  500 / 2000 [ 25%]  (Warmup) 
Chain 2 Iteration:  500 / 2000 [ 25%]  (Warmup) 
Chain 3 Iteration:  500 / 2000 [ 25%]  (Warmup) 
Chain 4 Iteration:  600 / 2000 [ 30%]  (Warmup) 
Chain 1 Iteration:  600 / 2000 [ 30%]  (Warmup) 
Chain 2 Iteration:  600 / 2000 [ 30%]  (Warmup) 
Chain 3 Iteration:  600 / 2000 [ 30%]  (Warmup) 
Chain 4 Iteration:  700 / 2000 [ 35%]  (Warmup) 
Chain 1 Iteration:  700 / 2000 [ 35%]  (Warmup) 
Chain 2 Iteration:  700 / 2000 [ 35%]  (Warmup) 
Chain 3 Iteration:  700 / 2000 [ 35%]  (Warmup) 
Chain 4 Iteration:  800 / 2000 [ 40%]  (Warmup) 
Chain 1 Iteration:  800 / 2000 [ 40%]  (Warmup) 
Chain 2 Iteration:  800 / 2000 [ 40%]  (Warmup) 
Chain 3 Iteration:  800 / 2000 [ 40%]  (Warmup) 
Chain 1 Iteration:  900 / 2000 [ 45%]  (Warmup) 
Chain 4 Iteration:  900 / 2000 [ 45%]  (Warmup) 
Chain 2 Iteration:  900 / 2000 [ 45%]  (Warmup) 
Chain 3 Iteration:  900 / 2000 [ 45%]  (Warmup) 
Chain 1 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 1 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 4 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 4 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 2 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 2 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 3 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 3 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 1 Iteration: 1100 / 2000 [ 55%]  (Sampling) 
Chain 4 Iteration: 1100 / 2000 [ 55%]  (Sampling) 
Chain 2 Iteration: 1100 / 2000 [ 55%]  (Sampling) 
Chain 3 Iteration: 1100 / 2000 [ 55%]  (Sampling) 
Chain 1 Iteration: 1200 / 2000 [ 60%]  (Sampling) 
Chain 2 Iteration: 1200 / 2000 [ 60%]  (Sampling) 
Chain 4 Iteration: 1200 / 2000 [ 60%]  (Sampling) 
Chain 3 Iteration: 1200 / 2000 [ 60%]  (Sampling) 
Chain 2 Iteration: 1300 / 2000 [ 65%]  (Sampling) 
Chain 3 Iteration: 1300 / 2000 [ 65%]  (Sampling) 
Chain 1 Iteration: 1300 / 2000 [ 65%]  (Sampling) 
Chain 4 Iteration: 1300 / 2000 [ 65%]  (Sampling) 
Chain 2 Iteration: 1400 / 2000 [ 70%]  (Sampling) 
Chain 3 Iteration: 1400 / 2000 [ 70%]  (Sampling) 
Chain 1 Iteration: 1400 / 2000 [ 70%]  (Sampling) 
Chain 4 Iteration: 1400 / 2000 [ 70%]  (Sampling) 
Chain 3 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
Chain 2 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
Chain 1 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
Chain 4 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
Chain 3 Iteration: 1600 / 2000 [ 80%]  (Sampling) 
Chain 2 Iteration: 1600 / 2000 [ 80%]  (Sampling) 
Chain 1 Iteration: 1600 / 2000 [ 80%]  (Sampling) 
Chain 4 Iteration: 1600 / 2000 [ 80%]  (Sampling) 
Chain 3 Iteration: 1700 / 2000 [ 85%]  (Sampling) 
Chain 2 Iteration: 1700 / 2000 [ 85%]  (Sampling) 
Chain 1 Iteration: 1700 / 2000 [ 85%]  (Sampling) 
Chain 3 Iteration: 1800 / 2000 [ 90%]  (Sampling) 
Chain 4 Iteration: 1700 / 2000 [ 85%]  (Sampling) 
Chain 2 Iteration: 1800 / 2000 [ 90%]  (Sampling) 
Chain 1 Iteration: 1800 / 2000 [ 90%]  (Sampling) 
Chain 3 Iteration: 1900 / 2000 [ 95%]  (Sampling) 
Chain 2 Iteration: 1900 / 2000 [ 95%]  (Sampling) 
Chain 4 Iteration: 1800 / 2000 [ 90%]  (Sampling) 
Chain 3 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 3 finished in 217.0 seconds.
Chain 1 Iteration: 1900 / 2000 [ 95%]  (Sampling) 
Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 2 finished in 220.5 seconds.
Chain 4 Iteration: 1900 / 2000 [ 95%]  (Sampling) 
Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 1 finished in 230.3 seconds.
Chain 4 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 4 finished in 234.8 seconds.

All 4 chains finished successfully.
Mean chain execution time: 225.7 seconds.
Total execution time: 235.4 seconds.
G3;Warning: 4 of 4000 (0.0%) transitions ended with a divergence.
See https://mc-stan.org/misc/warnings for details.

g

# m2_draws <- as_draws_df(m2)
# 
# # Extract prior and posterior draws
# powerscale_sensitivity(m2, variable = c("b_Intercept", "b_sin_doy1", "b_cos_doy1", "b_sin_doy2", "b_cos_doy2", "b_sin_doy3", "b_cos_doy3", "bs_smean_temp_c_1", "bs_smean_current_humidity_1", "bs_spop_density_km2_1", "bs_sbuilding_coverage_pct_1", "bs_sdist_to_road_m_1",         
# "sds_smean_temp_c_1", "sds_smean_current_humidity_1", "sds_spop_density_km2_1", "sds_sbuilding_coverage_pct_1", 
# "sds_sdist_to_road_m_1", "sdgp_gpxy", "lscale_gpxy", "sigma", "Intercept", "s_smean_temp_c_1[1]", "s_smean_temp_c_1[2]", "s_smean_temp_c_1[3]",         "s_smean_current_humidity_1[1]",  "s_smean_current_humidity_1[2]",  "s_smean_current_humidity_1[3]",  "s_spop_density_km2_1[1]",       
#  "s_spop_density_km2_1[2]",        "s_spop_density_km2_1[3]",        "s_sbuilding_coverage_pct_1[1]",  "s_sbuilding_coverage_pct_1[2]", 
#  "s_sbuilding_coverage_pct_1[3]",  "s_sdist_to_road_m_1[1]", "s_sdist_to_road_m_1[2]", "s_sdist_to_road_m_1[3]"))
# 
# powerscale_plot_dens(m2, variable = c("b_Intercept", "b_sin_doy1", "b_cos_doy1", "b_sin_doy2", "b_cos_doy2", "b_sin_doy3", "b_cos_doy3", "bs_smean_temp_c_1", "bs_smean_current_humidity_1", "bs_spop_density_km2_1", "bs_sbuilding_coverage_pct_1", "bs_sdist_to_road_m_1",         
# "sds_smean_temp_c_1", "sds_smean_current_humidity_1", "sds_spop_density_km2_1", "sds_sbuilding_coverage_pct_1", 
# "sds_sdist_to_road_m_1", "sdgp_gpxy", "lscale_gpxy", "sigma", "Intercept", "s_smean_temp_c_1[1]", "s_smean_temp_c_1[2]", "s_smean_temp_c_1[3]",         "s_smean_current_humidity_1[1]",  "s_smean_current_humidity_1[2]",  "s_smean_current_humidity_1[3]",  "s_spop_density_km2_1[1]",       
#  "s_spop_density_km2_1[2]",        "s_spop_density_km2_1[3]",        "s_sbuilding_coverage_pct_1[1]",  "s_sbuilding_coverage_pct_1[2]", 
#  "s_sbuilding_coverage_pct_1[3]",  "s_sdist_to_road_m_1[1]", "s_sdist_to_road_m_1[2]", "s_sdist_to_road_m_1[3]"))

Get predictions for model 2

m2_sum <- get_st_predictions(model = m2, nd=nd_m2, model_id = "m2", out_dir = "epred_chunks_m2", ndraws = 1000, chunk_size = 10000, who_pm25_limit = 25)
Processing chunk 1 of 38 
Processing chunk 2 of 38 
Processing chunk 3 of 38 
Processing chunk 4 of 38 
Processing chunk 5 of 38 
Processing chunk 6 of 38 
Processing chunk 7 of 38 
Processing chunk 8 of 38 
Processing chunk 9 of 38 
Processing chunk 10 of 38 
Processing chunk 11 of 38 
Processing chunk 12 of 38 
Processing chunk 13 of 38 
Processing chunk 14 of 38 
Processing chunk 15 of 38 
Processing chunk 16 of 38 
Processing chunk 17 of 38 
Processing chunk 18 of 38 
Processing chunk 19 of 38 
Processing chunk 20 of 38 
Processing chunk 21 of 38 
Processing chunk 22 of 38 
Processing chunk 23 of 38 
Processing chunk 24 of 38 
Processing chunk 25 of 38 
Processing chunk 26 of 38 
Processing chunk 27 of 38 
Processing chunk 28 of 38 
Processing chunk 29 of 38 
Processing chunk 30 of 38 
Processing chunk 31 of 38 
Processing chunk 32 of 38 
Processing chunk 33 of 38 
Processing chunk 34 of 38 
Processing chunk 35 of 38 
Processing chunk 36 of 38 
Processing chunk 37 of 38 
Processing chunk 38 of 38 

Plot predictions for model 2

m2_plots %>% map(plot)
$mean_log_plot

$mean_plot

$sd_plot

$exceed_plot

Sample coordinates, and plot over time


#sample coordinate points
#will sort out the small number later
set.seed(123)
sampled_points_m2 <- aq_model_data %>%
  sample_n(50) %>%
  select(x, y, mean_temp_c, mean_current_humidity, pop_density_km2, building_coverage_pct, dist_to_road_m, grid_id, log_pm2_5)

#generate DOY 0–365 and calculate Fourier terms
doy_grid <- tibble(doy = seq(0,365, by=1)) %>%
  mutate(
    sin_doy1 = sin(2 * pi * doy / 365),
    cos_doy1 = cos(2 * pi * doy / 365),
    sin_doy2 = sin(4 * pi * doy / 365),
    cos_doy2 = cos(4 * pi * doy / 365),
    sin_doy3 = sin(6 * pi * doy / 365),
    cos_doy3 = cos(6 * pi * doy / 365)
  )

#expand grid across sampled locations
prediction_df_m2 <- sampled_points_m2 %>%
  crossing(doy_grid)

#add predictions
preds_m2 <- add_epred_draws(object = m2, newdata = prediction_df_m2, ndraws = 100)

#observed data for plotting
observed_data <- aq_model_data %>%
  select(mean_doy, log_pm2_5) %>%
  mutate(mean_doy = round(mean_doy))

#plot
preds_m2 %>%
  ungroup() %>%
  ggplot(aes(x=doy)) +
  stat_lineribbon(aes(y=.epred), .width=0.95) + 
  geom_jitter(data = observed_data, aes(x = mean_doy, y = log_pm2_5),
              color = "darkred", alpha = 0.6, width = 0.5, height = 0.0, size = 1.2) +
  scale_fill_brewer() +
  labs(title = "Model-estimated log(PM2.5) with empirical measurements",
       subtitle = "Model predictions with 95% CrI and observed data points",
       x = "Day of year",
       y = "log(PM2.5)",
       caption = "Modelled estimates restricted to within clusters") +
  theme_ggdist() +
  theme(panel.background = element_rect(colour = "grey78"),
        legend.position = "none")

Compare models, using LOO CV


library(loo)

loo_m1 <- loo(m1)
loo_m2 <- loo(m2)

loo_compare(loo_m1, loo_m2)

PM10

Now to model PM10 data

TODO

---
title: "Blantyre Air Quality Modelling"
output: html_notebook
---

## Libraries

Load required libraries

```{r}
library(tidyverse)
library(here)
library(brms)
library(tidybayes)
library(sf)
library(mgcv)
library(priorsense)
library(osmdata)
library(nngeo)
library(glue)
library(arrow)
```

## Import data

Import the modelling dataset - cleaned and prepped in the `2025-05-12_aq_clean.Rmd` script

```{r}

aq_model_data <- read_rds("input_data/aq_model_data.rds")

#also load the scale clusters
load("input_data/scale_72_clusters.rda") #SCALE clusters

#and the grid data
mw_100m_cropped <- read_rds("input_data/mw_100m_cropped.rds")
mw_100m_grid_sf <- read_rds("input_data/mw_100m_grid_sf.rds")


```


## Functions

`get_st_predictions()`: A function to take posterior draws for a prediction matrix, write to an `arrow` database (to handle exhausting memory), then summarise

```{r}

get_st_predictions <- function(model, nd, model_id, out_dir, ndraws = 1000, chunk_size = 10000, who_pm25_limit = 25) {
  # Create output folder
  dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

  nd <- nd %>% mutate(.row = row_number())
  n_chunks <- ceiling(nrow(nd) / chunk_size)

  # Save design matrix to disk for later joins
  nd_file <- file.path(out_dir, paste0("nd_", model_id, ".parquet"))
  arrow::write_parquet(nd, nd_file)

  # Loop over chunks
  for (i in seq_len(n_chunks)) {
    cat("Processing chunk", i, "of", n_chunks, "\n")

    idx <- ((i - 1) * chunk_size + 1):min(i * chunk_size, nrow(nd))
    nd_chunk <- nd[idx, ]

    epred_array <- posterior_epred(model, newdata = nd_chunk, ndraws = ndraws)

    epred_df <- as.data.frame.table(epred_array, responseName = "epred")
    colnames(epred_df) <- c("draw", "row_in_chunk", "epred")

    epred_df <- epred_df %>%
      mutate(
        draw = as.integer(draw),
        row_in_chunk = as.integer(row_in_chunk),
        .row = idx[row_in_chunk],
        model_id = model_id
      ) %>%
      select(draw, .row, epred, model_id)

    arrow::write_parquet(
      epred_df,
      sink = file.path(out_dir, glue::glue("epred_chunk_{i}.parquet"))
    )

    rm(epred_array, epred_df, nd_chunk)
    gc()
  }

  # Read from disk
  ds <- arrow::open_dataset(out_dir)
  nd_ds <- arrow::open_dataset(nd_file)

  summary_df <- ds %>%
    mutate(epred_pm25 = exp(epred)) %>%
    group_by(.row) %>%
    summarise(
      mean_log_epred = mean(epred, na.rm = TRUE),
      sd_log_epred = sd(epred, na.rm = TRUE),
      lwr_log = quantile(epred, 0.025, na.rm = TRUE),
      upr_log = quantile(epred, 0.975, na.rm = TRUE),
      
      mean_epred = mean(epred_pm25, na.rm = TRUE),
      sd_epred = sd(epred_pm25, na.rm = TRUE),
      lwr = quantile(epred_pm25, 0.025, na.rm = TRUE),
      upr = quantile(epred_pm25, 0.975, na.rm = TRUE),
      
      prob_exceed = mean(epred_pm25 > who_pm25_limit, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    collect() %>%
    left_join(nd, by = ".row") %>%
    mutate(date = lubridate::month(date, label = TRUE))

  return(summary_df)
}


```

`plot_st_predictions()`: A function to plot spatiotemporal predictions from model posteriors

```{r}

plot_st_predictions <- function(summary_df, 
                                cluster_sf,
                                value_limits = list(mean = NULL, sd = NULL, prob = c(0, 1))) {
  require(ggplot2)
  require(viridis)
  require(ggdist)
  require(dplyr)
  require(lubridate)

  # Mean prediction plotss
    p1 <- summary_df %>%
    ggplot() +
    geom_tile(aes(x = x, y = y, fill = mean_log_epred)) +
    geom_sf(data = cluster_sf, colour = "grey78", fill = NA) +
    scale_fill_viridis_c(option = "D", limits = value_limits$mean) +
    facet_wrap(~date) +
    labs(title = "Posterior mean of log(PM2.5) (µg/m³)", x = "", y = "") +
    theme_ggdist() +
    theme(panel.background = element_rect(fill = NA, colour = "grey78"),
          axis.text.x = element_text(angle = 45, hjust = 1),
          strip.background = element_rect(colour = "grey78"))
    
  p2 <- summary_df %>%
    ggplot() +
    geom_tile(aes(x = x, y = y, fill = mean_epred)) +
    geom_sf(data = cluster_sf, colour = "grey78", fill = NA) +
    scale_fill_viridis_c(option = "D", limits = value_limits$mean) +
    facet_wrap(~date) +
    labs(title = "Posterior mean of PM2.5 (µg/m³)", x = "", y = "") +
    theme_ggdist() +
    theme(panel.background = element_rect(fill = NA, colour = "grey78"),
          axis.text.x = element_text(angle = 45, hjust = 1),
          strip.background = element_rect(colour = "grey78"))

  # Standard deviation plot
  p3 <- summary_df %>%
    ggplot() +
    geom_tile(aes(x = x, y = y, fill = sd_epred)) +
    geom_sf(data = cluster_sf, colour = "grey78", fill = NA) +
    scale_fill_viridis_c(option = "F", limits = value_limits$sd) +
    facet_wrap(~date) +
    labs(title = "Posterior SD of PM2.5 (µg/m³)", x = "", y = "") +
    theme_ggdist() +
    theme(panel.background = element_rect(fill = NA, colour = "grey78"),
          axis.text.x = element_text(angle = 45, hjust = 1),
          strip.background = element_rect(colour = "grey78"))

  # Exceedance probability plot
  p4 <- summary_df %>%
    ggplot() +
    geom_tile(aes(x = x, y = y, fill = prob_exceed)) +
    geom_sf(data = cluster_sf, colour = "grey78", fill = NA) +
    scale_fill_viridis_c(option = "C", labels = scales::percent_format(), limits = value_limits$prob) +
    facet_wrap(~date) +
    labs(title = "Pr(PM2.5 > 25 µg/m³)", x = "", y = "") +
    theme_ggdist() +
    theme(panel.background = element_rect(fill = NA, colour = "grey78"),
          axis.text.x = element_text(angle = 45, hjust = 1),
          strip.background = element_rect(colour = "grey78"))

  list(mean_log_plot = p1, mean_plot = p2, sd_plot = p3, exceed_plot = p4)
}




```


`compute_exposure_metrics()` calculates exposure metrics for the 1 year p


## PM2.5

First we will model PM2.5

### Model 1: Day of year only


"We fitted a spatially smooth regression model for log-transformed PM2.5 concentrations using a Gaussian process (GP) smooth over spatial coordinates (x, y) to capture flexible spatial trends. Seasonal variation was modelled using a Fourier series expansion of day-of-year up to the 3rd harmonic (i.e., sine and cosine terms for annual, semi-annual, and tri-annual cycles) to account for complex seasonal patterns. The model was fitted in a Bayesian framework using brms with a Gaussian likelihood and the cmdstanr backend."

```{r, fig.width=12, fig.height=12}

priors <- c(
  prior(normal(3.4, 1), class = "Intercept"),
  prior(normal(0,1), class = "b"),
  prior(exponential(1), class = "sigma"),
  prior(exponential(1), class = "sdgp"),
  prior(normal(0, 1), class = "lscale", lb = 0)
)

```

Fit prior only model

```{r, fig.width=12, fig.height=12}

m1_prior <- brm(
    formula = log_pm2_5 ~ gp(x, y, k=15) +
      sin_doy1 + cos_doy1 +
      sin_doy2 + cos_doy2 +
      sin_doy3 + cos_doy3,
    data = aq_model_data,
    family = gaussian(),
    prior = priors,
    sample_prior = "only",
    chains = 4, cores = 4,
    backend = "cmdstanr"
  )


# summary(m1_prior)
# plot(m1_prior)

```

Now model with data.


```{r, fig.width=12, fig.height=12}
m1 <- brm(
    formula = log_pm2_5 ~ gp(x, y, k=15)  +
    sin_doy1 + cos_doy1 +
    sin_doy2 + cos_doy2 +
    sin_doy3 + cos_doy3,
    data = aq_model_data,
    family = gaussian(),
    prior = priors,
    chains = 4, cores = 4,
    backend = "cmdstanr"
  )

# summary(m1)
# conditional_effects(m1)
# pp_check(m1)
# plot(m1)

```

Compare prior-only model to model with data using `priorsense` package:

```{r, fig.width=12, fig.height=12}

m1_draws <- as_draws_df(m1)

# Extract prior and posterior draws
# powerscale_sensitivity(m1, variable = c("b_Intercept", 
#                                         "b_sin_doy1", "b_cos_doy1",
#                                         "b_sin_doy2", "b_cos_doy2",
#                                         "b_sin_doy3", "b_cos_doy3",
#                                         "sdgp_gpxy", "sigma", "Intercept"))
# 
# powerscale_plot_dens(m1, variable = c("b_Intercept", 
#                                         "b_sin_doy1", "b_cos_doy1",
#                                         "b_sin_doy2", "b_cos_doy2",
#                                         "b_sin_doy3", "b_cos_doy3",
#                                         "sdgp_gpxy", "sigma", "Intercept"))
```

Predictions by space and time. Here, although we only have data within clusters, we will predict for cells outside of clusters based on covariates.
We will also predict for the month for whcih we do not have data.

set up the predictions matrix

```{r, fig.width=12, fig.height=12}

#set_up prediction date frame
nd_m1 <- mw_100m_grid_sf %>%
  sf::st_sf() %>%
  mutate(x = st_coordinates(st_centroid(.))[, 1],
         y = st_coordinates(st_centroid(.))[, 2]) %>%
  st_drop_geometry() %>%
  rename(pop_density = 1) %>%
  mutate(pop_density_km2 = pop_density / 0.01)

#get the first day of each month for prediction
first_days <- ymd(paste0("2020-", sprintf("%02d", 1:12), "-01"))
first_day_doy <- yday(first_days)

#calculate Fourier terms
first_day_seasonality <- tibble(
  mean_doy = first_day_doy,
  sin_doy1 = sin(2 * pi * mean_doy / 365.25),
  cos_doy1 = cos(2 * pi * mean_doy / 365.25),
  sin_doy2 = sin(4 * pi * mean_doy / 365.25),
  cos_doy2 = cos(4 * pi * mean_doy / 365.25),
  sin_doy3 = sin(6 * pi * mean_doy / 365.25),
  cos_doy3 = cos(6 * pi * mean_doy / 365.25),
  date = first_days
)

#expand grid for prediction
nd_m1 <- nd_m1 %>%
  crossing(first_day_seasonality) %>%
  mutate(building_coverage_pct = building_coverage_pct/100)

```


Get the spatiotemporal predictions for model 1


```{r}
m1_sum <- get_st_predictions(model = m1, nd=nd_m1, model_id = "m1", out_dir = "epred_chunks_m1", ndraws = 1000, chunk_size = 10000, who_pm25_limit = 25)

```

Plot predictions for model 1

```{r}
m1_plots <- plot_st_predictions(summary_df = m1_sum, cluster_sf = scale_72_clusters)

m1_plots %>% map(plot)

```



Now just draw 50 random coordinates, and predict by time to see whether we captured the trend.

```{r, fig.width=12, fig.height=12}
#sample coordinate points from the household dataset
#we will fix the small number sampled later!
set.seed(123) 
sampled_points_m1 <- aq_model_data %>%
  sample_n(50) %>%
  select(x, y)

#generate DOY 0–365 and calculate Fourier terms
doy_grid <- tibble(doy = 0:365) %>%
  mutate(
    sin_doy1 = sin(2 * pi * doy / 365),
    cos_doy1 = cos(2 * pi * doy / 365),
    sin_doy2 = sin(4 * pi * doy / 365),
    cos_doy2 = cos(4 * pi * doy / 365),
    sin_doy3 = sin(6 * pi * doy / 365),
    cos_doy3 = cos(6 * pi * doy / 365)
  )

#expand grid across sampled locations
prediction_df_m1 <- sampled_points_m1 %>%
  crossing(doy_grid)

#add predictions
preds_m1 <- add_epred_draws(object = m1, newdata = prediction_df_m1)

#summarise
preds_m1_sum <- preds_m1 %>%
  ungroup() %>%
  group_by(doy) %>%
  summarise(.epred_mean = mean(.epred),
            .lower = quantile(.epred, probs=0.025),
            .upper = quantile(.epred, probs = 0.975))

#GEt the observed data
observed_data <- aq_model_data %>%
  select(mean_doy, log_pm2_5) 

#plot
preds_m1_sum %>%
  ggplot() +
  geom_ribbon(aes(x=doy, ymin=.lower, ymax = .upper), fill="steelblue", alpha=0.3) +
  geom_line(aes(x=doy, y=.epred_mean)) +
  geom_jitter(data = observed_data, aes(x = mean_doy, y = log_pm2_5),
              color = "darkred", alpha = 0.6, width = 0.5, height = 0.0, size = 1.2) +
  labs(title = "Model-estimated log(PM2.5) with empirical measurements",
       subtitle = "Model predictions with 95% CrI and observed data points",
       x = "Day of year",
       y = "log(PM2.5)",
       caption = "Modelled estimates restricted to within clusters") +
  theme_ggdist() +
  theme(panel.background = element_rect(colour = "grey78"),
        legend.position = "none")

```

### Model 2: With covariates

Now we include grid level covariates of distance to the road, population density, and building footprint percent, along witht the mean temp and huidity on the day of measurement (from the purple air monitors)

Again, priors to be fixed later

```{r, fig.width=12, fig.height=12}
priors <- c(
  prior(normal(3.4, 1), class = "Intercept"),
  prior(normal(0,1), class = "b"),
  prior(exponential(1), class = "sigma"),
  prior(exponential(1), class = "sdgp"),
  prior(normal(0, 1), class = "lscale", lb = 0)
)


m2 <- brm(
    formula = log_pm2_5 ~ gp(x, y, k=15)  +
      s(mean_temp_c, k=5) + 
      s(mean_current_humidity, k=5) +
      s(pop_density_km2, k=5) +
      s(building_coverage_pct, k=5) +
      s(dist_to_road_m, k=5) +
      sin_doy1 + cos_doy1 +
      sin_doy2 + cos_doy2 +
      sin_doy3 + cos_doy3,
    data = aq_model_data,
    family = gaussian(),
    prior = priors,
    chains = 4, cores = 4,
    backend = "cmdstanr"
  )

summary(m2)
conditional_effects(m2)
pp_check(m2)
plot(m2)


```

```{r, fig.width=12, fig.height=12}

# m2_draws <- as_draws_df(m2)
# 
# # Extract prior and posterior draws
# powerscale_sensitivity(m2, variable = c("b_Intercept", "b_sin_doy1", "b_cos_doy1", "b_sin_doy2", "b_cos_doy2", "b_sin_doy3", "b_cos_doy3", "bs_smean_temp_c_1", "bs_smean_current_humidity_1", "bs_spop_density_km2_1", "bs_sbuilding_coverage_pct_1", "bs_sdist_to_road_m_1",         
# "sds_smean_temp_c_1", "sds_smean_current_humidity_1", "sds_spop_density_km2_1", "sds_sbuilding_coverage_pct_1", 
# "sds_sdist_to_road_m_1", "sdgp_gpxy", "lscale_gpxy", "sigma", "Intercept", "s_smean_temp_c_1[1]", "s_smean_temp_c_1[2]", "s_smean_temp_c_1[3]",         "s_smean_current_humidity_1[1]",  "s_smean_current_humidity_1[2]",  "s_smean_current_humidity_1[3]",  "s_spop_density_km2_1[1]",       
#  "s_spop_density_km2_1[2]",        "s_spop_density_km2_1[3]",        "s_sbuilding_coverage_pct_1[1]",  "s_sbuilding_coverage_pct_1[2]", 
#  "s_sbuilding_coverage_pct_1[3]",  "s_sdist_to_road_m_1[1]", "s_sdist_to_road_m_1[2]", "s_sdist_to_road_m_1[3]"))
# 
# powerscale_plot_dens(m2, variable = c("b_Intercept", "b_sin_doy1", "b_cos_doy1", "b_sin_doy2", "b_cos_doy2", "b_sin_doy3", "b_cos_doy3", "bs_smean_temp_c_1", "bs_smean_current_humidity_1", "bs_spop_density_km2_1", "bs_sbuilding_coverage_pct_1", "bs_sdist_to_road_m_1",         
# "sds_smean_temp_c_1", "sds_smean_current_humidity_1", "sds_spop_density_km2_1", "sds_sbuilding_coverage_pct_1", 
# "sds_sdist_to_road_m_1", "sdgp_gpxy", "lscale_gpxy", "sigma", "Intercept", "s_smean_temp_c_1[1]", "s_smean_temp_c_1[2]", "s_smean_temp_c_1[3]",         "s_smean_current_humidity_1[1]",  "s_smean_current_humidity_1[2]",  "s_smean_current_humidity_1[3]",  "s_spop_density_km2_1[1]",       
#  "s_spop_density_km2_1[2]",        "s_spop_density_km2_1[3]",        "s_sbuilding_coverage_pct_1[1]",  "s_sbuilding_coverage_pct_1[2]", 
#  "s_sbuilding_coverage_pct_1[3]",  "s_sdist_to_road_m_1[1]", "s_sdist_to_road_m_1[2]", "s_sdist_to_road_m_1[3]"))
```


Get predictions for model 2

```{r}
m2_sum <- get_st_predictions(model = m2, nd=nd_m2, model_id = "m2", out_dir = "epred_chunks_m2", ndraws = 1000, chunk_size = 10000, who_pm25_limit = 25)

```

Plot predictions for model 2

```{r}
m2_plots <- plot_st_predictions(summary_df = m2_sum, cluster_sf = scale_72_clusters)

m2_plots %>% map(plot)

```


Sample coordinates, and plot over time

```{r, fig.width=12, fig.height=12}

#sample coordinate points
#will sort out the small number later
set.seed(123)
sampled_points_m2 <- aq_model_data %>%
  sample_n(50) %>%
  select(x, y, mean_temp_c, mean_current_humidity, pop_density_km2, building_coverage_pct, dist_to_road_m, grid_id, log_pm2_5)

#generate DOY 0–365 and calculate Fourier terms
doy_grid <- tibble(doy = seq(0,365, by=1)) %>%
  mutate(
    sin_doy1 = sin(2 * pi * doy / 365),
    cos_doy1 = cos(2 * pi * doy / 365),
    sin_doy2 = sin(4 * pi * doy / 365),
    cos_doy2 = cos(4 * pi * doy / 365),
    sin_doy3 = sin(6 * pi * doy / 365),
    cos_doy3 = cos(6 * pi * doy / 365)
  )

#expand grid across sampled locations
prediction_df_m2 <- sampled_points_m2 %>%
  crossing(doy_grid)

#add predictions
preds_m2 <- add_epred_draws(object = m2, newdata = prediction_df_m2, ndraws = 100)

#observed data for plotting
observed_data <- aq_model_data %>%
  select(mean_doy, log_pm2_5) %>%
  mutate(mean_doy = round(mean_doy))

#plot
preds_m2 %>%
  ungroup() %>%
  ggplot(aes(x=doy)) +
  stat_lineribbon(aes(y=.epred), .width=0.95) + 
  geom_jitter(data = observed_data, aes(x = mean_doy, y = log_pm2_5),
              color = "darkred", alpha = 0.6, width = 0.5, height = 0.0, size = 1.2) +
  scale_fill_brewer() +
  labs(title = "Model-estimated log(PM2.5) with empirical measurements",
       subtitle = "Model predictions with 95% CrI and observed data points",
       x = "Day of year",
       y = "log(PM2.5)",
       caption = "Modelled estimates restricted to within clusters") +
  theme_ggdist() +
  theme(panel.background = element_rect(colour = "grey78"),
        legend.position = "none")



```


Compare models, using LOO CV

```{r}

library(loo)

loo_m1 <- loo(m1)
loo_m2 <- loo(m2)

loo_compare(loo_m1, loo_m2)
```


## PM10

Now to model PM10 data



TODO

 - OTHER OUTCOME MEASURES (PM1, PM10, NO)
 - CAN WE LINK TO THE STATIC AQ MONITORS (OUTDOORS)
 - PRIORS
 - OTHER COVARIATES
 - COMPARE MODELS WITH DIFFERENT BASIS FUNCTIONS FOR GP AND SPLINES
 - EXCEEDANCES
 - LINK TO MTB INFECTION PREVALENCE DATA, AND MODEL
